import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
train1 = pd.read_csv("train.csv", index_col=0, keep_date_col=True)
test1 = pd.read_csv("test.csv", index_col=0)
df0 = pd.concat([train1, test1])
df0
| date | store_nbr | family | sales | onpromotion | |
|---|---|---|---|---|---|
| id | |||||
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 |
| ... | ... | ... | ... | ... | ... |
| 3029395 | 2017-08-31 | 9 | POULTRY | NaN | 1 |
| 3029396 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 |
| 3029397 | 2017-08-31 | 9 | PRODUCE | NaN | 1 |
| 3029398 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 |
| 3029399 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 |
3029400 rows × 5 columns
gas = pd.read_csv("oil.csv", names=['date','gas_price'], header=0)
date_range = pd.period_range(start="2013-01-01", end="2017-08-31")
date_range1 = date_range.astype(str, copy=False)
calendar = pd.DataFrame(date_range1, columns= ["date"])
calendar.dtypes
date object dtype: object
oil = pd.merge(left=calendar, right = gas, how = "left", on="date")
oil
| date | gas_price | |
|---|---|---|
| 0 | 2013-01-01 | NaN |
| 1 | 2013-01-02 | 93.14 |
| 2 | 2013-01-03 | 92.97 |
| 3 | 2013-01-04 | 93.12 |
| 4 | 2013-01-05 | NaN |
| ... | ... | ... |
| 1699 | 2017-08-27 | NaN |
| 1700 | 2017-08-28 | 46.40 |
| 1701 | 2017-08-29 | 46.46 |
| 1702 | 2017-08-30 | 45.96 |
| 1703 | 2017-08-31 | 47.26 |
1704 rows × 2 columns
interpolated_oil = oil.interpolate(method="linear")
interpolated_oil.fillna(93.140000, inplace=True)
interpolated_oil
| date | gas_price | |
|---|---|---|
| 0 | 2013-01-01 | 93.140000 |
| 1 | 2013-01-02 | 93.140000 |
| 2 | 2013-01-03 | 92.970000 |
| 3 | 2013-01-04 | 93.120000 |
| 4 | 2013-01-05 | 93.146667 |
| ... | ... | ... |
| 1699 | 2017-08-27 | 46.816667 |
| 1700 | 2017-08-28 | 46.400000 |
| 1701 | 2017-08-29 | 46.460000 |
| 1702 | 2017-08-30 | 45.960000 |
| 1703 | 2017-08-31 | 47.260000 |
1704 rows × 2 columns
df1 = pd.merge(left=df0, right=interpolated_oil, how="left", on="date")
df1
| date | store_nbr | family | sales | onpromotion | gas_price | |
|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 |
| ... | ... | ... | ... | ... | ... | ... |
| 3029395 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 |
| 3029396 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 |
| 3029397 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 |
| 3029398 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 |
| 3029399 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 |
3029400 rows × 6 columns
stores = pd.read_csv("stores.csv")
stores
| store_nbr | city | state | type | cluster | |
|---|---|---|---|---|---|
| 0 | 1 | Quito | Pichincha | D | 13 |
| 1 | 2 | Quito | Pichincha | D | 13 |
| 2 | 3 | Quito | Pichincha | D | 8 |
| 3 | 4 | Quito | Pichincha | D | 9 |
| 4 | 5 | Santo Domingo | Santo Domingo de los Tsachilas | D | 4 |
| 5 | 6 | Quito | Pichincha | D | 13 |
| 6 | 7 | Quito | Pichincha | D | 8 |
| 7 | 8 | Quito | Pichincha | D | 8 |
| 8 | 9 | Quito | Pichincha | B | 6 |
| 9 | 10 | Quito | Pichincha | C | 15 |
| 10 | 11 | Cayambe | Pichincha | B | 6 |
| 11 | 12 | Latacunga | Cotopaxi | C | 15 |
| 12 | 13 | Latacunga | Cotopaxi | C | 15 |
| 13 | 14 | Riobamba | Chimborazo | C | 7 |
| 14 | 15 | Ibarra | Imbabura | C | 15 |
| 15 | 16 | Santo Domingo | Santo Domingo de los Tsachilas | C | 3 |
| 16 | 17 | Quito | Pichincha | C | 12 |
| 17 | 18 | Quito | Pichincha | B | 16 |
| 18 | 19 | Guaranda | Bolivar | C | 15 |
| 19 | 20 | Quito | Pichincha | B | 6 |
| 20 | 21 | Santo Domingo | Santo Domingo de los Tsachilas | B | 6 |
| 21 | 22 | Puyo | Pastaza | C | 7 |
| 22 | 23 | Ambato | Tungurahua | D | 9 |
| 23 | 24 | Guayaquil | Guayas | D | 1 |
| 24 | 25 | Salinas | Santa Elena | D | 1 |
| 25 | 26 | Guayaquil | Guayas | D | 10 |
| 26 | 27 | Daule | Guayas | D | 1 |
| 27 | 28 | Guayaquil | Guayas | E | 10 |
| 28 | 29 | Guayaquil | Guayas | E | 10 |
| 29 | 30 | Guayaquil | Guayas | C | 3 |
| 30 | 31 | Babahoyo | Los Rios | B | 10 |
| 31 | 32 | Guayaquil | Guayas | C | 3 |
| 32 | 33 | Quevedo | Los Rios | C | 3 |
| 33 | 34 | Guayaquil | Guayas | B | 6 |
| 34 | 35 | Playas | Guayas | C | 3 |
| 35 | 36 | Libertad | Guayas | E | 10 |
| 36 | 37 | Cuenca | Azuay | D | 2 |
| 37 | 38 | Loja | Loja | D | 4 |
| 38 | 39 | Cuenca | Azuay | B | 6 |
| 39 | 40 | Machala | El Oro | C | 3 |
| 40 | 41 | Machala | El Oro | D | 4 |
| 41 | 42 | Cuenca | Azuay | D | 2 |
| 42 | 43 | Esmeraldas | Esmeraldas | E | 10 |
| 43 | 44 | Quito | Pichincha | A | 5 |
| 44 | 45 | Quito | Pichincha | A | 11 |
| 45 | 46 | Quito | Pichincha | A | 14 |
| 46 | 47 | Quito | Pichincha | A | 14 |
| 47 | 48 | Quito | Pichincha | A | 14 |
| 48 | 49 | Quito | Pichincha | A | 11 |
| 49 | 50 | Ambato | Tungurahua | A | 14 |
| 50 | 51 | Guayaquil | Guayas | A | 17 |
| 51 | 52 | Manta | Manabi | A | 11 |
| 52 | 53 | Manta | Manabi | D | 13 |
| 53 | 54 | El Carmen | Manabi | C | 3 |
df2 = pd.merge(left=df1, right=stores, how="left", on="store_nbr")
df2
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3029395 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 |
| 3029396 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 |
| 3029397 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 |
| 3029398 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 |
| 3029399 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 |
3029400 rows × 10 columns
holiday = pd.read_csv("holidays_events.csv")
holiday
| date | type | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|
| 0 | 2012-03-02 | Holiday | Local | Manta | Fundacion de Manta | False |
| 1 | 2012-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 2 | 2012-04-12 | Holiday | Local | Cuenca | Fundacion de Cuenca | False |
| 3 | 2012-04-14 | Holiday | Local | Libertad | Cantonizacion de Libertad | False |
| 4 | 2012-04-21 | Holiday | Local | Riobamba | Cantonizacion de Riobamba | False |
| ... | ... | ... | ... | ... | ... | ... |
| 345 | 2017-12-22 | Additional | National | Ecuador | Navidad-3 | False |
| 346 | 2017-12-23 | Additional | National | Ecuador | Navidad-2 | False |
| 347 | 2017-12-24 | Additional | National | Ecuador | Navidad-1 | False |
| 348 | 2017-12-25 | Holiday | National | Ecuador | Navidad | False |
| 349 | 2017-12-26 | Additional | National | Ecuador | Navidad+1 | False |
350 rows × 6 columns
holiday_national=holiday.loc[holiday['locale_name'] == "Ecuador" , :]
holiday_national
| date | type | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|
| 14 | 2012-08-10 | Holiday | National | Ecuador | Primer Grito de Independencia | False |
| 19 | 2012-10-09 | Holiday | National | Ecuador | Independencia de Guayaquil | True |
| 20 | 2012-10-12 | Transfer | National | Ecuador | Traslado Independencia de Guayaquil | False |
| 21 | 2012-11-02 | Holiday | National | Ecuador | Dia de Difuntos | False |
| 22 | 2012-11-03 | Holiday | National | Ecuador | Independencia de Cuenca | False |
| ... | ... | ... | ... | ... | ... | ... |
| 345 | 2017-12-22 | Additional | National | Ecuador | Navidad-3 | False |
| 346 | 2017-12-23 | Additional | National | Ecuador | Navidad-2 | False |
| 347 | 2017-12-24 | Additional | National | Ecuador | Navidad-1 | False |
| 348 | 2017-12-25 | Holiday | National | Ecuador | Navidad | False |
| 349 | 2017-12-26 | Additional | National | Ecuador | Navidad+1 | False |
174 rows × 6 columns
locale_name = set(holiday.locale_name)
holiday.locale_name.value_counts()
Ecuador 174 Quito 13 Riobamba 12 Guaranda 12 Latacunga 12 Ambato 12 Guayaquil 11 Cuenca 7 Ibarra 7 Salinas 6 Loja 6 Santa Elena 6 Santo Domingo de los Tsachilas 6 Quevedo 6 Manta 6 Esmeraldas 6 Cotopaxi 6 El Carmen 6 Santo Domingo 6 Machala 6 Imbabura 6 Puyo 6 Libertad 6 Cayambe 6 Name: locale_name, dtype: int64
city_name = set(df2.city)
df2.city.value_counts()
Quito 1009800 Guayaquil 448800 Santo Domingo 168300 Cuenca 168300 Latacunga 112200 Manta 112200 Machala 112200 Ambato 112200 Quevedo 56100 Esmeraldas 56100 Loja 56100 Libertad 56100 Playas 56100 Daule 56100 Babahoyo 56100 Cayambe 56100 Salinas 56100 Puyo 56100 Guaranda 56100 Ibarra 56100 Riobamba 56100 El Carmen 56100 Name: city, dtype: int64
city_name & locale_name
{'Ambato',
'Cayambe',
'Cuenca',
'El Carmen',
'Esmeraldas',
'Guaranda',
'Guayaquil',
'Ibarra',
'Latacunga',
'Libertad',
'Loja',
'Machala',
'Manta',
'Puyo',
'Quevedo',
'Quito',
'Riobamba',
'Salinas',
'Santo Domingo'}
df2.state.value_counts()
Pichincha 1065900 Guayas 617100 Santo Domingo de los Tsachilas 168300 Azuay 168300 Manabi 168300 Cotopaxi 112200 Tungurahua 112200 Los Rios 112200 El Oro 112200 Chimborazo 56100 Imbabura 56100 Bolivar 56100 Pastaza 56100 Santa Elena 56100 Loja 56100 Esmeraldas 56100 Name: state, dtype: int64
set(df2.state) & locale_name
{'Cotopaxi',
'Esmeraldas',
'Imbabura',
'Loja',
'Santa Elena',
'Santo Domingo de los Tsachilas'}
holiday
| date | type | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|
| 0 | 2012-03-02 | Holiday | Local | Manta | Fundacion de Manta | False |
| 1 | 2012-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 2 | 2012-04-12 | Holiday | Local | Cuenca | Fundacion de Cuenca | False |
| 3 | 2012-04-14 | Holiday | Local | Libertad | Cantonizacion de Libertad | False |
| 4 | 2012-04-21 | Holiday | Local | Riobamba | Cantonizacion de Riobamba | False |
| ... | ... | ... | ... | ... | ... | ... |
| 345 | 2017-12-22 | Additional | National | Ecuador | Navidad-3 | False |
| 346 | 2017-12-23 | Additional | National | Ecuador | Navidad-2 | False |
| 347 | 2017-12-24 | Additional | National | Ecuador | Navidad-1 | False |
| 348 | 2017-12-25 | Holiday | National | Ecuador | Navidad | False |
| 349 | 2017-12-26 | Additional | National | Ecuador | Navidad+1 | False |
350 rows × 6 columns
# add national holidays:
df3_national = pd.merge(left=df2, right=holiday_national[["date","locale"]], how="left", on="date")
df3_national
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | locale | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3036523 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | NaN |
| 3036524 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | NaN |
| 3036525 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | NaN |
| 3036526 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 | NaN |
| 3036527 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | NaN |
3036528 rows × 11 columns
df3_national['locale'] = df3_national['locale'].fillna(0)
df3_national['national_holiday'] = df3_national['locale'].map(lambda x: 'No' if x==0 else 'Yes')
df3_national
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | locale | national_holiday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3036523 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No |
| 3036524 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No |
| 3036525 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No |
| 3036526 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 | 0 | No |
| 3036527 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No |
3036528 rows × 12 columns
# add regional holidays:
holiday_regional=holiday.loc[holiday['locale'] == "Regional" , :]
holiday_regional
| date | type | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|
| 1 | 2012-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 7 | 2012-06-25 | Holiday | Regional | Imbabura | Provincializacion de Imbabura | False |
| 23 | 2012-11-06 | Holiday | Regional | Santo Domingo de los Tsachilas | Provincializacion de Santo Domingo | False |
| 24 | 2012-11-07 | Holiday | Regional | Santa Elena | Provincializacion Santa Elena | False |
| 47 | 2013-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 58 | 2013-06-25 | Holiday | Regional | Imbabura | Provincializacion de Imbabura | False |
| 76 | 2013-11-06 | Holiday | Regional | Santo Domingo de los Tsachilas | Provincializacion de Santo Domingo | False |
| 77 | 2013-11-07 | Holiday | Regional | Santa Elena | Provincializacion Santa Elena | False |
| 96 | 2014-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 112 | 2014-06-25 | Holiday | Regional | Imbabura | Provincializacion de Imbabura | False |
| 139 | 2014-11-06 | Holiday | Regional | Santo Domingo de los Tsachilas | Provincializacion de Santo Domingo | False |
| 140 | 2014-11-07 | Holiday | Regional | Santa Elena | Provincializacion Santa Elena | False |
| 165 | 2015-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 177 | 2015-06-25 | Holiday | Regional | Imbabura | Provincializacion de Imbabura | False |
| 193 | 2015-11-06 | Holiday | Regional | Santo Domingo de los Tsachilas | Provincializacion de Santo Domingo | False |
| 194 | 2015-11-07 | Holiday | Regional | Santa Elena | Provincializacion Santa Elena | False |
| 216 | 2016-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 259 | 2016-06-25 | Holiday | Regional | Imbabura | Provincializacion de Imbabura | False |
| 278 | 2016-11-06 | Holiday | Regional | Santo Domingo de los Tsachilas | Provincializacion de Santo Domingo | False |
| 279 | 2016-11-07 | Holiday | Regional | Santa Elena | Provincializacion Santa Elena | False |
| 302 | 2017-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 315 | 2017-06-25 | Holiday | Regional | Imbabura | Provincializacion de Imbabura | False |
| 334 | 2017-11-06 | Holiday | Regional | Santo Domingo de los Tsachilas | Provincializacion de Santo Domingo | False |
| 335 | 2017-11-07 | Holiday | Regional | Santa Elena | Provincializacion Santa Elena | False |
df3_regional = pd.merge(left=df3_national, right=holiday_regional[["date","locale_name"]], how="left",
left_on=['date','state'],
right_on=['date','locale_name'])
df3_regional
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | locale | national_holiday | locale_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3036523 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN |
| 3036524 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN |
| 3036525 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN |
| 3036526 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN |
| 3036527 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN |
3036528 rows × 13 columns
df3_regional.locale_name.value_counts()
Santo Domingo de los Tsachilas 396 Cotopaxi 330 Imbabura 165 Santa Elena 132 Name: locale_name, dtype: int64
df3_regional["regional_holiday"] = df3_regional.locale_name.fillna(0).map(lambda x: 'No' if x==0 else 'Yes')
df3_regional
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | locale | national_holiday | locale_name | regional_holiday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3036523 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No |
| 3036524 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No |
| 3036525 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No |
| 3036526 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No |
| 3036527 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No |
3036528 rows × 14 columns
# add local holidays:
holiday_local=holiday.loc[holiday['locale'] == "Local" , :]
holiday_local
| date | type | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|
| 0 | 2012-03-02 | Holiday | Local | Manta | Fundacion de Manta | False |
| 2 | 2012-04-12 | Holiday | Local | Cuenca | Fundacion de Cuenca | False |
| 3 | 2012-04-14 | Holiday | Local | Libertad | Cantonizacion de Libertad | False |
| 4 | 2012-04-21 | Holiday | Local | Riobamba | Cantonizacion de Riobamba | False |
| 5 | 2012-05-12 | Holiday | Local | Puyo | Cantonizacion del Puyo | False |
| ... | ... | ... | ... | ... | ... | ... |
| 339 | 2017-12-05 | Additional | Local | Quito | Fundacion de Quito-1 | False |
| 340 | 2017-12-06 | Holiday | Local | Quito | Fundacion de Quito | True |
| 341 | 2017-12-08 | Holiday | Local | Loja | Fundacion de Loja | False |
| 342 | 2017-12-08 | Transfer | Local | Quito | Traslado Fundacion de Quito | False |
| 344 | 2017-12-22 | Holiday | Local | Salinas | Cantonizacion de Salinas | False |
152 rows × 6 columns
df3_local = pd.merge(left=df3_regional, right=holiday_local[["date","locale_name"]], how="left",
left_on=['date','city'],
right_on=['date','locale_name'])
df3_local
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | locale | national_holiday | locale_name_x | regional_holiday | locale_name_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3036787 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN |
| 3036788 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN |
| 3036789 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN |
| 3036790 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN |
| 3036791 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN |
3036792 rows × 15 columns
df3_local['local_holiday'] = df3_local['locale_name_y'].fillna(0).map(lambda x: 'No' if x==0 else 'Yes')
df3_local
| date | store_nbr | family | sales | onpromotion | gas_price | city | state | type | cluster | locale | national_holiday | locale_name_x | regional_holiday | locale_name_y | local_holiday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013-01-01 | 1 | AUTOMOTIVE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN | No |
| 1 | 2013-01-01 | 1 | BABY CARE | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN | No |
| 2 | 2013-01-01 | 1 | BEAUTY | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN | No |
| 3 | 2013-01-01 | 1 | BEVERAGES | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN | No |
| 4 | 2013-01-01 | 1 | BOOKS | 0.0 | 0 | 93.14 | Quito | Pichincha | D | 13 | National | Yes | NaN | No | NaN | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3036787 | 2017-08-31 | 9 | POULTRY | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN | No |
| 3036788 | 2017-08-31 | 9 | PREPARED FOODS | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN | No |
| 3036789 | 2017-08-31 | 9 | PRODUCE | NaN | 1 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN | No |
| 3036790 | 2017-08-31 | 9 | SCHOOL AND OFFICE SUPPLIES | NaN | 9 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN | No |
| 3036791 | 2017-08-31 | 9 | SEAFOOD | NaN | 0 | 47.26 | Quito | Pichincha | B | 6 | 0 | No | NaN | No | NaN | No |
3036792 rows × 16 columns
df3_local['national_holiday'].value_counts()
No 2774838 Yes 261954 Name: national_holiday, dtype: int64
df3_local['regional_holiday'].value_counts()
No 3035769 Yes 1023 Name: regional_holiday, dtype: int64
df3_local['local_holiday'].value_counts()
No 3024219 Yes 12573 Name: local_holiday, dtype: int64
df3 = df3_local[['date','store_nbr','type','cluster','family',
'sales','onpromotion','gas_price',
'national_holiday', 'regional_holiday', 'local_holiday']]
column_name = ['date','store_number','store_type','store_cluster','product_family',
'sales_volume','items_on_promotion','gas_price',
'national_holiday', 'regional_holiday', 'local_holiday']
df3.columns = column_name
df3['date'] = pd.to_datetime(df3['date'], format = '%Y-%m-%d')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/1179531089.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df3['date'] = pd.to_datetime(df3['date'], format = '%Y-%m-%d')
data = df3.drop_duplicates(keep='first')
# add some date columns: year, month, day_of_week
data['year']=data['date'].dt.year.astype('category')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/3995476843.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['year']=data['date'].dt.year.astype('category')
data['month']=data['date'].dt.month_name().astype('category')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/328802076.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['month']=data['date'].dt.month_name().astype('category')
data['day_of_week'] = data['date'].dt.day_name().astype('category')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/405239943.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data['day_of_week'] = data['date'].dt.day_name().astype('category')
data.sample(50)
| date | store_number | store_type | store_cluster | product_family | sales_volume | items_on_promotion | gas_price | national_holiday | regional_holiday | local_holiday | day_of_week | year | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1115064 | 2014-09-19 | 45 | A | 11 | PLAYERS AND ELECTRONICS | 12.000000 | 0 | 92.430000 | No | No | No | Friday | 2014 | September |
| 2656217 | 2017-01-30 | 30 | C | 3 | HARDWARE | 0.000000 | 0 | 52.630000 | No | No | No | Monday | 2017 | January |
| 1765111 | 2015-09-19 | 35 | C | 3 | CLEANING | 561.000000 | 4 | 45.363333 | No | No | No | Saturday | 2015 | September |
| 359617 | 2013-07-21 | 49 | A | 11 | HOME AND KITCHEN II | 0.000000 | 0 | 107.073333 | No | No | No | Sunday | 2013 | July |
| 300118 | 2013-06-18 | 3 | D | 8 | HOME AND KITCHEN II | 0.000000 | 0 | 98.460000 | No | No | No | Tuesday | 2013 | June |
| 310717 | 2013-06-24 | 27 | D | 1 | LIQUOR,WINE,BEER | 37.000000 | 0 | 95.070000 | No | No | No | Monday | 2013 | June |
| 1968821 | 2016-01-12 | 50 | A | 14 | DAIRY | 849.000000 | 12 | 30.420000 | No | No | No | Tuesday | 2016 | January |
| 2321873 | 2016-07-25 | 49 | A | 11 | PET SUPPLIES | 22.000000 | 0 | 42.400000 | No | No | No | Monday | 2016 | July |
| 2373955 | 2016-08-24 | 11 | B | 6 | BABY CARE | 1.000000 | 0 | 46.290000 | No | No | No | Wednesday | 2016 | August |
| 222450 | 2013-05-05 | 5 | D | 4 | PRODUCE | 14.000000 | 0 | 95.616667 | No | No | No | Sunday | 2013 | May |
| 3003496 | 2017-08-13 | 25 | D | 1 | BABY CARE | 0.000000 | 0 | 47.996667 | No | No | No | Sunday | 2017 | August |
| 423953 | 2013-08-26 | 54 | C | 3 | BEAUTY | 0.000000 | 0 | 105.880000 | No | No | No | Monday | 2013 | August |
| 2892389 | 2017-06-11 | 8 | D | 8 | BREAD/BAKERY | 1101.784000 | 0 | 46.006667 | No | No | No | Sunday | 2017 | June |
| 1888426 | 2015-11-27 | 45 | A | 11 | BABY CARE | 0.000000 | 0 | 40.570000 | Yes | No | No | Friday | 2015 | November |
| 2571188 | 2016-12-12 | 44 | A | 5 | PET SUPPLIES | 25.000000 | 1 | 52.740000 | No | No | No | Monday | 2016 | December |
| 1261070 | 2014-12-10 | 42 | D | 2 | DAIRY | 0.000000 | 0 | 60.990000 | No | No | No | Wednesday | 2014 | December |
| 103271 | 2013-02-27 | 7 | D | 8 | HARDWARE | 3.000000 | 0 | 92.840000 | No | No | No | Wednesday | 2013 | February |
| 1620868 | 2015-06-30 | 38 | D | 4 | CLEANING | 1049.000000 | 2 | 59.480000 | No | No | No | Tuesday | 2015 | June |
| 1335575 | 2015-01-21 | 32 | C | 3 | SEAFOOD | 0.000000 | 0 | 47.850000 | No | No | No | Wednesday | 2015 | January |
| 2646151 | 2017-01-24 | 48 | A | 14 | GROCERY II | 21.000000 | 0 | 52.380000 | No | No | No | Tuesday | 2017 | January |
| 254365 | 2013-05-23 | 46 | A | 14 | BABY CARE | 0.000000 | 0 | 94.120000 | No | No | No | Thursday | 2013 | May |
| 2631120 | 2017-01-16 | 26 | D | 10 | PRODUCE | 574.673000 | 4 | 52.427500 | No | No | No | Monday | 2017 | January |
| 2336432 | 2016-08-02 | 8 | D | 8 | SEAFOOD | 49.267002 | 0 | 39.500000 | No | No | No | Tuesday | 2016 | August |
| 2598389 | 2016-12-28 | 9 | B | 6 | BEAUTY | 14.000000 | 1 | 54.010000 | No | No | No | Wednesday | 2016 | December |
| 1593304 | 2015-06-15 | 14 | C | 7 | SCHOOL AND OFFICE SUPPLIES | 0.000000 | 0 | 59.530000 | No | No | No | Monday | 2015 | June |
| 1368961 | 2015-02-09 | 2 | D | 13 | LIQUOR,WINE,BEER | 63.000000 | 0 | 52.990000 | No | No | No | Monday | 2015 | February |
| 1598559 | 2015-06-18 | 12 | C | 15 | CELEBRATION | 13.000000 | 0 | 60.410000 | No | No | No | Thursday | 2015 | June |
| 2909726 | 2017-06-21 | 43 | E | 10 | HOME APPLIANCES | 0.000000 | 0 | 42.480000 | No | No | No | Wednesday | 2017 | June |
| 3023649 | 2017-08-24 | 4 | D | 9 | MEATS | NaN | 0 | 47.240000 | No | No | No | Thursday | 2017 | August |
| 1284792 | 2014-12-23 | 9 | B | 6 | BEVERAGES | 2682.000000 | 4 | 56.780000 | Yes | No | No | Tuesday | 2014 | December |
| 527198 | 2013-10-23 | 50 | A | 14 | MAGAZINES | 0.000000 | 0 | 96.900000 | No | No | No | Wednesday | 2013 | October |
| 473580 | 2013-09-23 | 46 | A | 14 | PRODUCE | 5.000000 | 0 | 103.620000 | No | No | No | Monday | 2013 | September |
| 848148 | 2014-04-22 | 7 | D | 8 | HOME AND KITCHEN I | 0.000000 | 0 | 101.690000 | No | No | No | Tuesday | 2014 | April |
| 969796 | 2014-06-30 | 2 | D | 13 | PERSONAL CARE | 264.000000 | 0 | 106.070000 | Yes | No | No | Monday | 2014 | June |
| 5601 | 2013-01-04 | 16 | C | 3 | MEATS | 13.625000 | 0 | 93.120000 | No | No | No | Friday | 2013 | January |
| 2226219 | 2016-06-02 | 23 | D | 9 | CELEBRATION | 8.000000 | 0 | 49.140000 | No | No | No | Thursday | 2016 | June |
| 1042489 | 2014-08-10 | 1 | D | 13 | LADIESWEAR | 0.000000 | 0 | 97.930000 | Yes | No | No | Sunday | 2014 | August |
| 701211 | 2014-01-30 | 33 | C | 3 | PLAYERS AND ELECTRONICS | 3.000000 | 0 | 98.250000 | No | No | No | Thursday | 2014 | January |
| 1156391 | 2014-10-12 | 6 | D | 13 | BREAD/BAKERY | 793.818000 | 1 | 85.776667 | No | No | No | Sunday | 2014 | October |
| 1197895 | 2014-11-05 | 2 | D | 13 | POULTRY | 569.457000 | 0 | 78.710000 | No | No | No | Wednesday | 2014 | November |
| 2404585 | 2016-09-10 | 20 | B | 6 | CLEANING | 1766.000000 | 35 | 46.013333 | No | No | No | Saturday | 2016 | September |
| 497567 | 2013-10-07 | 2 | D | 13 | PET SUPPLIES | 0.000000 | 0 | 103.070000 | No | No | No | Monday | 2013 | October |
| 405704 | 2013-08-16 | 42 | D | 2 | BEAUTY | 0.000000 | 0 | 107.580000 | No | No | No | Friday | 2013 | August |
| 224970 | 2013-05-07 | 21 | B | 6 | DELI | 0.000000 | 0 | 95.280000 | No | No | No | Tuesday | 2013 | May |
| 1988509 | 2016-01-23 | 52 | A | 11 | POULTRY | 0.000000 | 0 | 31.483333 | No | No | No | Saturday | 2016 | January |
| 66152 | 2013-02-07 | 15 | C | 15 | LAWN AND GARDEN | 0.000000 | 0 | 95.840000 | No | No | No | Thursday | 2013 | February |
| 2538739 | 2016-11-24 | 34 | B | 6 | HOME AND KITCHEN II | 24.000000 | 1 | 46.720000 | No | No | No | Thursday | 2016 | November |
| 268872 | 2013-05-31 | 52 | A | 11 | LINGERIE | 0.000000 | 0 | 91.930000 | No | No | No | Friday | 2013 | May |
| 812602 | 2014-04-03 | 1 | D | 13 | EGGS | 105.000000 | 0 | 100.290000 | No | No | No | Thursday | 2014 | April |
| 1497485 | 2015-04-22 | 26 | D | 10 | FROZEN FOODS | 45.000000 | 2 | 56.170000 | No | No | No | Wednesday | 2015 | April |
data.dtypes
date datetime64[ns] store_number int64 store_type object store_cluster int64 product_family object sales_volume float64 items_on_promotion int64 gas_price float64 national_holiday object regional_holiday object local_holiday object day_of_week category year category month category dtype: object
category_columns = ['store_number', 'store_type', 'store_cluster', 'product_family','year', 'month', 'day_of_week', 'national_holiday', 'regional_holiday','local_holiday']
data.loc[:, category_columns] = data.loc[:,category_columns].apply(lambda x: x.astype('category'))
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/2128232577.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data.loc[:, category_columns] = data.loc[:,category_columns].apply(lambda x: x.astype('category'))
data.dtypes
date datetime64[ns] store_number category store_type category store_cluster category product_family category sales_volume float64 items_on_promotion int64 gas_price float64 national_holiday category regional_holiday category local_holiday category day_of_week category year category month category dtype: object
sns.heatmap(data.corr(),annot=True )
<AxesSubplot:>
sns.scatterplot(data=data, x='items_on_promotion',y='sales_volume')
<AxesSubplot:xlabel='items_on_promotion', ylabel='sales_volume'>
sns.scatterplot(data=data, x='gas_price',y='sales_volume')
<AxesSubplot:xlabel='gas_price', ylabel='sales_volume'>
dataset = data.query('sales_volume!=0.0')
# train dataset:
train = dataset.loc[data['sales_volume'].notnull(),:]
X_train = train.drop(columns=['sales_volume','date','store_type','store_cluster'])
y_train = train['sales_volume']
# test dataset:
test = dataset.loc[data['sales_volume'].isnull(),:]
X_test = test.drop(columns=['sales_volume','date','store_type','store_cluster'])
from statsmodels.formula.api import ols
data.columns
Index(['date', 'store_number', 'store_type', 'store_cluster', 'product_family',
'sales_volume', 'items_on_promotion', 'gas_price', 'national_holiday',
'regional_holiday', 'local_holiday', 'day_of_week', 'year', 'month'],
dtype='object')
model = ols("np.log(sales_volume) ~ store_number + product_family + items_on_promotion + gas_price + national_holiday + regional_holiday + local_holiday + day_of_week + year + month", data=train).fit()
print(model.summary())
OLS Regression Results
================================================================================
Dep. Variable: np.log(sales_volume) R-squared: 0.893
Model: OLS Adj. R-squared: 0.893
Method: Least Squares F-statistic: 1.549e+05
Date: Mon, 01 Aug 2022 Prob (F-statistic): 0.00
Time: 09:58:31 Log-Likelihood: -2.3772e+06
No. Observations: 2061758 AIC: 4.755e+06
Df Residuals: 2061646 BIC: 4.756e+06
Df Model: 111
Covariance Type: nonrobust
================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------
Intercept 1.3455 0.009 149.419 0.000 1.328 1.363
store_number[T.2] 0.3367 0.005 64.526 0.000 0.326 0.347
store_number[T.3] 1.1616 0.005 224.954 0.000 1.152 1.172
store_number[T.4] 0.1665 0.005 31.795 0.000 0.156 0.177
store_number[T.5] 0.1071 0.005 20.469 0.000 0.097 0.117
store_number[T.6] 0.4671 0.005 89.202 0.000 0.457 0.477
store_number[T.7] 0.5024 0.005 96.524 0.000 0.492 0.513
store_number[T.8] 0.7540 0.005 145.263 0.000 0.744 0.764
store_number[T.9] 0.3694 0.005 70.710 0.000 0.359 0.380
store_number[T.10] -0.7124 0.005 -130.977 0.000 -0.723 -0.702
store_number[T.11] 0.2475 0.005 47.036 0.000 0.237 0.258
store_number[T.12] -0.4503 0.005 -82.550 0.000 -0.461 -0.440
store_number[T.13] -0.6395 0.005 -117.546 0.000 -0.650 -0.629
store_number[T.14] -0.4347 0.005 -79.430 0.000 -0.445 -0.424
store_number[T.15] -0.4272 0.005 -79.557 0.000 -0.438 -0.417
store_number[T.16] -0.4997 0.005 -91.332 0.000 -0.510 -0.489
store_number[T.17] -0.0539 0.005 -10.010 0.000 -0.064 -0.043
store_number[T.18] -0.1656 0.005 -30.605 0.000 -0.176 -0.155
store_number[T.19] -0.4186 0.005 -78.049 0.000 -0.429 -0.408
store_number[T.20] -0.0242 0.006 -3.970 0.000 -0.036 -0.012
store_number[T.21] -0.0821 0.006 -12.772 0.000 -0.095 -0.070
store_number[T.22] -0.5883 0.007 -85.176 0.000 -0.602 -0.575
store_number[T.23] -0.2774 0.005 -51.977 0.000 -0.288 -0.267
store_number[T.24] 0.4624 0.005 87.303 0.000 0.452 0.473
store_number[T.25] -0.2982 0.005 -55.567 0.000 -0.309 -0.288
store_number[T.26] -0.7841 0.005 -145.462 0.000 -0.795 -0.774
store_number[T.27] 0.1814 0.005 34.711 0.000 0.171 0.192
store_number[T.28] -0.0116 0.005 -2.184 0.029 -0.022 -0.001
store_number[T.29] -0.1235 0.006 -19.851 0.000 -0.136 -0.111
store_number[T.30] -0.7480 0.005 -137.083 0.000 -0.759 -0.737
store_number[T.31] -0.0883 0.005 -16.662 0.000 -0.099 -0.078
store_number[T.32] -1.0893 0.006 -194.729 0.000 -1.100 -1.078
store_number[T.33] -0.1955 0.005 -35.932 0.000 -0.206 -0.185
store_number[T.34] -0.0874 0.005 -16.578 0.000 -0.098 -0.077
store_number[T.35] -0.8988 0.006 -161.014 0.000 -0.910 -0.888
store_number[T.36] -0.2108 0.005 -38.512 0.000 -0.222 -0.200
store_number[T.37] 0.2857 0.005 54.768 0.000 0.275 0.296
store_number[T.38] 0.2261 0.005 43.276 0.000 0.216 0.236
store_number[T.39] 0.1407 0.005 26.941 0.000 0.130 0.151
store_number[T.40] -0.1553 0.005 -28.772 0.000 -0.166 -0.145
store_number[T.41] -0.0083 0.005 -1.588 0.112 -0.019 0.002
store_number[T.42] 0.1347 0.006 20.791 0.000 0.122 0.147
store_number[T.43] -0.2468 0.005 -45.158 0.000 -0.258 -0.236
store_number[T.44] 1.3486 0.005 261.301 0.000 1.338 1.359
store_number[T.45] 1.1438 0.005 220.695 0.000 1.134 1.154
store_number[T.46] 0.8637 0.005 166.673 0.000 0.854 0.874
store_number[T.47] 1.0795 0.005 208.540 0.000 1.069 1.090
store_number[T.48] 0.6827 0.005 130.937 0.000 0.672 0.693
store_number[T.49] 0.9463 0.005 182.597 0.000 0.936 0.956
store_number[T.50] 0.5762 0.005 110.755 0.000 0.566 0.586
store_number[T.51] 0.6281 0.005 120.917 0.000 0.618 0.638
store_number[T.52] 0.6498 0.013 48.521 0.000 0.624 0.676
store_number[T.53] -0.2024 0.006 -35.533 0.000 -0.214 -0.191
store_number[T.54] -0.6133 0.005 -111.998 0.000 -0.624 -0.603
product_family[T.BABY CARE] -1.1534 0.011 -106.097 0.000 -1.175 -1.132
product_family[T.BEAUTY] -0.4876 0.004 -121.189 0.000 -0.496 -0.480
product_family[T.BEVERAGES] 5.8800 0.004 1536.894 0.000 5.873 5.888
product_family[T.BOOKS] -1.6696 0.015 -112.276 0.000 -1.699 -1.640
product_family[T.BREAD/BAKERY] 4.3549 0.004 1145.111 0.000 4.347 4.362
product_family[T.CELEBRATION] 0.5985 0.004 137.251 0.000 0.590 0.607
product_family[T.CLEANING] 5.2675 0.004 1381.249 0.000 5.260 5.275
product_family[T.DAIRY] 4.6604 0.004 1221.097 0.000 4.653 4.668
product_family[T.DELI] 3.8252 0.004 1003.864 0.000 3.818 3.833
product_family[T.EGGS] 3.3146 0.004 872.086 0.000 3.307 3.322
product_family[T.FROZEN FOODS] 2.9191 0.004 768.136 0.000 2.912 2.927
product_family[T.GROCERY I] 6.4008 0.004 1634.661 0.000 6.393 6.409
product_family[T.GROCERY II] 0.9919 0.004 258.169 0.000 0.984 0.999
product_family[T.HARDWARE] -1.1542 0.004 -259.242 0.000 -1.163 -1.146
product_family[T.HOME AND KITCHEN I] 1.3623 0.004 326.526 0.000 1.354 1.370
product_family[T.HOME AND KITCHEN II] 1.1153 0.004 267.231 0.000 1.107 1.123
product_family[T.HOME APPLIANCES] -1.4861 0.006 -263.069 0.000 -1.497 -1.475
product_family[T.HOME CARE] 3.8177 0.004 888.180 0.000 3.809 3.826
product_family[T.LADIESWEAR] 0.5387 0.005 110.542 0.000 0.529 0.548
product_family[T.LAWN AND GARDEN] -0.0471 0.004 -10.603 0.000 -0.056 -0.038
product_family[T.LINGERIE] 0.1252 0.004 32.210 0.000 0.118 0.133
product_family[T.LIQUOR,WINE,BEER] 2.5024 0.004 639.785 0.000 2.495 2.510
product_family[T.MAGAZINES] -0.4514 0.005 -95.216 0.000 -0.461 -0.442
product_family[T.MEATS] 3.9396 0.004 1036.065 0.000 3.932 3.947
product_family[T.PERSONAL CARE] 3.8681 0.004 1017.526 0.000 3.861 3.876
product_family[T.PET SUPPLIES] -0.2070 0.005 -44.292 0.000 -0.216 -0.198
product_family[T.PLAYERS AND ELECTRONICS] 0.2599 0.004 59.200 0.000 0.251 0.269
product_family[T.POULTRY] 3.8923 0.004 1023.929 0.000 3.885 3.900
product_family[T.PREPARED FOODS] 2.6472 0.004 696.674 0.000 2.640 2.655
product_family[T.PRODUCE] 4.3704 0.004 1062.705 0.000 4.362 4.378
product_family[T.SCHOOL AND OFFICE SUPPLIES] -0.6847 0.006 -119.871 0.000 -0.696 -0.674
product_family[T.SEAFOOD] 0.8532 0.004 217.187 0.000 0.846 0.861
national_holiday[T.Yes] 0.1214 0.002 60.463 0.000 0.117 0.125
regional_holiday[T.Yes] 0.0863 0.030 2.919 0.004 0.028 0.144
local_holiday[T.Yes] -0.0210 0.008 -2.539 0.011 -0.037 -0.005
day_of_week[T.Monday] -0.0105 0.002 -5.233 0.000 -0.014 -0.007
day_of_week[T.Saturday] 0.3028 0.002 151.936 0.000 0.299 0.307
day_of_week[T.Sunday] 0.2766 0.002 137.387 0.000 0.273 0.281
day_of_week[T.Thursday] -0.0990 0.002 -49.227 0.000 -0.103 -0.095
day_of_week[T.Tuesday] -0.0538 0.002 -26.855 0.000 -0.058 -0.050
day_of_week[T.Wednesday] -0.0673 0.002 -33.529 0.000 -0.071 -0.063
year[T.2014] 0.2383 0.002 126.963 0.000 0.235 0.242
year[T.2015] 0.2241 0.004 55.326 0.000 0.216 0.232
year[T.2016] 0.2524 0.004 57.396 0.000 0.244 0.261
year[T.2017] 0.3869 0.004 91.868 0.000 0.379 0.395
month[T.August] 0.0500 0.003 18.922 0.000 0.045 0.055
month[T.December] 0.2555 0.003 90.043 0.000 0.250 0.261
month[T.February] 0.0016 0.003 0.593 0.553 -0.004 0.007
month[T.January] 0.0417 0.003 16.091 0.000 0.037 0.047
month[T.July] 0.0636 0.003 25.189 0.000 0.059 0.069
month[T.June] 0.0308 0.003 11.929 0.000 0.026 0.036
month[T.March] 0.0455 0.003 17.803 0.000 0.040 0.050
month[T.May] -0.0021 0.003 -0.803 0.422 -0.007 0.003
month[T.November] 0.0633 0.003 22.764 0.000 0.058 0.069
month[T.October] 0.0873 0.003 32.151 0.000 0.082 0.093
month[T.September] 0.1059 0.003 38.530 0.000 0.101 0.111
items_on_promotion 0.0071 4.13e-05 171.654 0.000 0.007 0.007
gas_price -0.0022 7.33e-05 -29.315 0.000 -0.002 -0.002
==============================================================================
Omnibus: 694655.121 Durbin-Watson: 2.046
Prob(Omnibus): 0.000 Jarque-Bera (JB): 6912581.068
Skew: -1.336 Prob(JB): 0.00
Kurtosis: 11.563 Cond. No. 3.78e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.78e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
train['predicted_sales_volume'] = np.exp(model.predict(X_train))
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/1406944995.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train['predicted_sales_volume'] = np.exp(model.predict(X_train))
train[['sales_volume'].plot(figsize=(16,8))
<AxesSubplot:>
test['predicted_sales_volume'] = np.ceil(np.exp(model.predict(X_test)))
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/656582870.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test['predicted_sales_volume'] = np.ceil(np.exp(model.predict(X_test)))
sns.relplot(data=test, x='date' , y ='predicted_sales_volume', hue="product_family", markers=True, row='store_number' , kind='line', aspect = 2,
facet_kws={'sharey': False, 'sharex': False})
<seaborn.axisgrid.FacetGrid at 0x216e0ae20>
test.to_csv('forecast_final.csv')
train.to_csv('historical_final.csv')
data.to_csv('integrated_final.csv')